{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Script for creating a subset of articles that mention at least one company from the graph database and storing it in a collection on the local system"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pymongo import MongoClient"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from bson import ObjectId"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "local = MongoClient()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "gem = MongoClient('act4dgem.cse.iitd.ac.in')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "articlesColl = gem.get_database('media-db').get_collection('articles')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "resolved_entities = gem.get_database('eventwise_media-db').get_collection('all_media_entities_resolved_final3')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "organizationArticles = local.get_database('media-db').get_collection('articlesWithGraphOrg2')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from py2neo import Graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "graph = Graph(host='10.237.27.115')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "getName = 'match (x) where x.uuid = {} return x.name as name'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "isCompany = 'match (x) where x.uuid = {} return \"company\" in labels(x) as res'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "getCompany = 'match (x:company) return x.cin as cin, x.name as name, x.uuid as uuid'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load every company node (cin, name, uuid) into a DataFrame, one row per node.\n",
    "# graph.run(...).data() yields the result rows as a list of dicts and is available\n",
    "# on both older and newer py2neo releases, unlike the Graph.data() shortcut.\n",
    "companyDf = pd.DataFrame(graph.run(getCompany).data())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(93492, 3)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "companyDf.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cin</th>\n",
       "      <th>name</th>\n",
       "      <th>uuid</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>L00000CH1983PLC031318</td>\n",
       "      <td>SAB INDUSTRIES LIMITED</td>\n",
       "      <td>154241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>L00000CH1990PLC010573</td>\n",
       "      <td>LAKSHMI ENERGY AND FOODS LIMITED</td>\n",
       "      <td>154242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>L00000HP1994PLC014770</td>\n",
       "      <td>HITKARI INDUSTRIES LIMITED</td>\n",
       "      <td>154243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>L00000MH1946PLC197474</td>\n",
       "      <td>QUADRANT TELEVENTURES LIMITED</td>\n",
       "      <td>154244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>L01100MH1996PLC100380</td>\n",
       "      <td>USHER AGRO LIMITED</td>\n",
       "      <td>154245</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                     cin                              name    uuid\n",
       "0  L00000CH1983PLC031318            SAB INDUSTRIES LIMITED  154241\n",
       "1  L00000CH1990PLC010573  LAKSHMI ENERGY AND FOODS LIMITED  154242\n",
       "2  L00000HP1994PLC014770        HITKARI INDUSTRIES LIMITED  154243\n",
       "3  L00000MH1946PLC197474     QUADRANT TELEVENTURES LIMITED  154244\n",
       "4  L01100MH1996PLC100380                USHER AGRO LIMITED  154245"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "companyDf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "companyDf.index = companyDf.uuid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>cin</th>\n",
       "      <th>name</th>\n",
       "      <th>uuid</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uuid</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>154241</th>\n",
       "      <td>L00000CH1983PLC031318</td>\n",
       "      <td>SAB INDUSTRIES LIMITED</td>\n",
       "      <td>154241</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154242</th>\n",
       "      <td>L00000CH1990PLC010573</td>\n",
       "      <td>LAKSHMI ENERGY AND FOODS LIMITED</td>\n",
       "      <td>154242</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154243</th>\n",
       "      <td>L00000HP1994PLC014770</td>\n",
       "      <td>HITKARI INDUSTRIES LIMITED</td>\n",
       "      <td>154243</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154244</th>\n",
       "      <td>L00000MH1946PLC197474</td>\n",
       "      <td>QUADRANT TELEVENTURES LIMITED</td>\n",
       "      <td>154244</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>154245</th>\n",
       "      <td>L01100MH1996PLC100380</td>\n",
       "      <td>USHER AGRO LIMITED</td>\n",
       "      <td>154245</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          cin                              name    uuid\n",
       "uuid                                                                   \n",
       "154241  L00000CH1983PLC031318            SAB INDUSTRIES LIMITED  154241\n",
       "154242  L00000CH1990PLC010573  LAKSHMI ENERGY AND FOODS LIMITED  154242\n",
       "154243  L00000HP1994PLC014770        HITKARI INDUSTRIES LIMITED  154243\n",
       "154244  L00000MH1946PLC197474     QUADRANT TELEVENTURES LIMITED  154244\n",
       "154245  L01100MH1996PLC100380                USHER AGRO LIMITED  154245"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "companyDf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "uuidToNameDict = companyDf['name'].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "93492"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(uuidToNameDict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Copy every article referenced by a company-linked resolved entity from the\n",
    "# remote 'articles' collection into the local 'articlesWithGraphOrg2' collection.\n",
    "#\n",
    "# Counters left in the namespace for the summary cell below:\n",
    "#   c                   - resolved entities whose graphid maps to a named company\n",
    "#   articles_covered    - every article id listed on those entities (may repeat)\n",
    "#   articles_count      - articles newly inserted into the local collection\n",
    "#   missing_article_ids - ids listed on an entity but absent from the remote collection\n",
    "c = 0\n",
    "articles_covered = []\n",
    "articles_count = 0\n",
    "missing_article_ids = []\n",
    "seen_article_ids = set()  # ids already handled in this run; avoids repeated round-trips\n",
    "entities_resolved_with_graph = resolved_entities.find({'graphid': {'$exists': True}}, no_cursor_timeout=True)\n",
    "try:\n",
    "    for entity in entities_resolved_with_graph:\n",
    "        # Skip entities whose graphid is not a company uuid (or whose company has no name).\n",
    "        if not uuidToNameDict.get(entity['graphid']):\n",
    "            continue\n",
    "        articleIds = entity['articleIds']\n",
    "        c += 1\n",
    "        articles_covered += articleIds\n",
    "        for aId in articleIds:\n",
    "            if aId in seen_article_ids:\n",
    "                continue\n",
    "            seen_article_ids.add(aId)\n",
    "            article_oid = ObjectId(aId)\n",
    "            # Check the local copy first so the remote fetch only happens when needed;\n",
    "            # the projection keeps the existence check from pulling the whole document.\n",
    "            if organizationArticles.find_one({'_id': article_oid}, {'_id': 1}):\n",
    "                continue\n",
    "            doc = articlesColl.find_one({'_id': article_oid})\n",
    "            if doc is None:\n",
    "                # insert_one(None) would raise TypeError; record the dangling id instead.\n",
    "                missing_article_ids.append(aId)\n",
    "                continue\n",
    "            organizationArticles.insert_one(doc)\n",
    "            articles_count += 1\n",
    "finally:\n",
    "    # A no_cursor_timeout cursor is never reaped by the server, so close it explicitly,\n",
    "    # even if the loop is interrupted or raises.\n",
    "    entities_resolved_with_graph.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "93364 34258 34258\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "34258"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Matched entities, newly inserted articles, distinct article ids referenced.\n",
    "print(c, articles_count, len(set(articles_covered)))\n",
    "# Total documents now in the local collection. Cursor.count() was deprecated in\n",
    "# PyMongo 3.7 and removed in 4.0; count_documents is the supported replacement.\n",
    "organizationArticles.count_documents({})"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
